{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import copy\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.preprocessing import LabelEncoder, LabelBinarizer\n", "import category_encoders as ce\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Identifying Categorical Data: Nominal, Ordinal and Continuous" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df_flights = pd.read_csv('https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearmonthdaydep_timedep_delayarr_timearr_delaycarriertailnumflightorigindestair_timedistancehourminute
02014111.096.0235.070.0ASN508AS145PDXANC194.015420.01.0
12014114.0-6.0738.0-23.0USN195UW1830SEACLT252.022790.04.0
22014118.013.0548.0-4.0UAN374221609PDXIAH201.018250.08.0
320141128.0-2.0800.0-23.0USN547UW466PDXCLT251.022820.028.0
420141134.044.0325.043.0ASN762AS121SEAANC201.014480.034.0
\n", "
" ], "text/plain": [ " year month day dep_time dep_delay arr_time arr_delay carrier tailnum \\\n", "0 2014 1 1 1.0 96.0 235.0 70.0 AS N508AS \n", "1 2014 1 1 4.0 -6.0 738.0 -23.0 US N195UW \n", "2 2014 1 1 8.0 13.0 548.0 -4.0 UA N37422 \n", "3 2014 1 1 28.0 -2.0 800.0 -23.0 US N547UW \n", "4 2014 1 1 34.0 44.0 325.0 43.0 AS N762AS \n", "\n", " flight origin dest air_time distance hour minute \n", "0 145 PDX ANC 194.0 1542 0.0 1.0 \n", "1 1830 SEA CLT 252.0 2279 0.0 4.0 \n", "2 1609 PDX IAH 201.0 1825 0.0 8.0 \n", "3 466 PDX CLT 251.0 2282 0.0 28.0 \n", "4 121 SEA ANC 201.0 1448 0.0 34.0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_flights.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 162049 entries, 0 to 162048\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 year 162049 non-null int64 \n", " 1 month 162049 non-null int64 \n", " 2 day 162049 non-null int64 \n", " 3 dep_time 161192 non-null float64\n", " 4 dep_delay 161192 non-null float64\n", " 5 arr_time 161061 non-null float64\n", " 6 arr_delay 160748 non-null float64\n", " 7 carrier 162049 non-null object \n", " 8 tailnum 161801 non-null object \n", " 9 flight 162049 non-null int64 \n", " 10 origin 162049 non-null object \n", " 11 dest 162049 non-null object \n", " 12 air_time 160748 non-null float64\n", " 13 distance 162049 non-null int64 \n", " 14 hour 161192 non-null float64\n", " 15 minute 161192 non-null float64\n", "dtypes: float64(7), int64(5), object(4)\n", "memory usage: 19.8+ MB\n" ] } ], "source": [ "df_flights.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df_flights.boxplot('dep_time', 'origin',rot = 30, figsize=(5,6))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "cat_df_flights = df_flights.select_dtypes(include=['object']).copy()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindest
0ASN508ASPDXANC
1USN195UWSEACLT
2UAN37422PDXIAH
3USN547UWPDXCLT
4ASN762ASSEAANC
\n", "
" ], "text/plain": [ " carrier tailnum origin dest\n", "0 AS N508AS PDX ANC\n", "1 US N195UW SEA CLT\n", "2 UA N37422 PDX IAH\n", "3 US N547UW PDX CLT\n", "4 AS N762AS SEA ANC" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "248" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights.isnull().values.sum()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "carrier 0\n", "tailnum 248\n", "origin 0\n", "dest 0\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "cat_df_flights = cat_df_flights.fillna(cat_df_flights['tailnum'].value_counts().index[0])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights.isnull().values.sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "carrier_count = cat_df_flights['carrier'].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AS 62460\n", "WN 23355\n", "OO 18710\n", "DL 16716\n", "UA 16671\n", "AA 7586\n", "US 5946\n", "B6 3540\n", "VX 3272\n", "F9 2698\n", "HA 1095\n", "Name: carrier, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "carrier_count" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "11" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "carrier_count.count()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['AS', 'WN', 'OO', 'DL', 'UA', 'AA', 'US', 'B6', 'VX', 'F9', 'HA'], dtype='object')" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "carrier_count.index" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([62460, 23355, 18710, 16716, 16671, 7586, 5946, 3540, 3272,\n", " 2698, 1095], dtype=int64)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "carrier_count.values" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "sns.set_style('darkgrid')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.barplot(carrier_count.index, carrier_count.values, alpha=0.9)\n", "plt.title('Frequency Distribution of Carriers')\n", "plt.xlabel('Carriers', fontsize=12)\n", "plt.ylabel('Number of Occurrences', fontsize=12)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.pie(carrier_count.values,\n", " labels=carrier_count.index,\n", " autopct='%1.1f%%', shadow=True)\n", "plt.axis('equal')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Encoding Categorical Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Replacing values" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "replace_map_comp = {'carrier' :{val:idx \n", " for idx, val in enumerate(cat_df_flights['carrier']\n", " .astype('category')\n", " .cat.categories.tolist())}}" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'carrier': {'AA': 0,\n", " 'AS': 1,\n", " 'B6': 2,\n", " 'DL': 3,\n", " 'F9': 4,\n", " 'HA': 5,\n", " 'OO': 6,\n", " 'UA': 7,\n", " 'US': 8,\n", " 'VX': 9,\n", " 'WN': 10}}" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "replace_map_comp" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_replace = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_replace.replace(replace_map_comp, inplace=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindest
01N508ASPDXANC
18N195UWSEACLT
27N37422PDXIAH
38N547UWPDXCLT
41N762ASSEAANC
\n", "
" ], "text/plain": [ " carrier tailnum origin dest\n", "0 1 N508AS PDX ANC\n", "1 8 N195UW SEA CLT\n", "2 7 N37422 PDX IAH\n", "3 8 N547UW PDX CLT\n", "4 1 N762AS SEA ANC" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_replace.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('int64')" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_replace['carrier'].dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Encoding labels" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_lc = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# typecasting categorical features to a category dtype to make it faster than object dtype\n", "cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].astype('category')\n", "cat_df_flights_lc['origin'] = cat_df_flights_lc['origin'].astype('category')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].cat.codes" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindest
01N508ASPDXANC
18N195UWSEACLT
27N37422PDXIAH
38N547UWPDXCLT
41N762ASSEAANC
\n", "
" ], "text/plain": [ " carrier tailnum origin dest\n", "0 1 N508AS PDX ANC\n", "1 8 N195UW SEA CLT\n", "2 7 N37422 PDX IAH\n", "3 8 N547UW PDX CLT\n", "4 1 N762AS SEA ANC" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_lc.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_specific = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_specific['US_code'] = np.where(cat_df_flights_specific['carrier'].str.contains('US'), 1, 0)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindestUS_code
0ASN508ASPDXANC0
1USN195UWSEACLT1
2UAN37422PDXIAH0
3USN547UWPDXCLT1
4ASN762ASSEAANC0
\n", "
" ], "text/plain": [ " carrier tailnum origin dest US_code\n", "0 AS N508AS PDX ANC 0\n", "1 US N195UW SEA CLT 1\n", "2 UA N37422 PDX IAH 0\n", "3 US N547UW PDX CLT 1\n", "4 AS N762AS SEA ANC 0" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_specific.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_sklearn = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "lb_make = LabelEncoder()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_sklearn['carrier_code'] = lb_make.fit_transform(cat_df_flights_sklearn['carrier'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindestcarrier_code
0ASN508ASPDXANC1
1USN195UWSEACLT8
2UAN37422PDXIAH7
3USN547UWPDXCLT8
4ASN762ASSEAANC1
\n", "
" ], "text/plain": [ " carrier tailnum origin dest carrier_code\n", "0 AS N508AS PDX ANC 1\n", "1 US N195UW SEA CLT 8\n", "2 UA N37422 PDX IAH 7\n", "3 US N547UW PDX CLT 8\n", "4 AS N762AS SEA ANC 1" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_sklearn.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## One-Hot encoding" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_onehot = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# cat_df_flights_onehot = pd.get_dummies(cat_df_flights_onehot, columns=['carrier', 'dest'], prefix=['carrier', 'dest'])\n", "cat_df_flights_onehot = pd.get_dummies(cat_df_flights_onehot, columns=['carrier'], prefix=['carrier'])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tailnumorigindestcarrier_AAcarrier_AScarrier_B6carrier_DLcarrier_F9carrier_HAcarrier_OOcarrier_UAcarrier_UScarrier_VXcarrier_WN
0N508ASPDXANC01000000000
1N195UWSEACLT00000000100
2N37422PDXIAH00000001000
3N547UWPDXCLT00000000100
4N762ASSEAANC01000000000
\n", "
" ], "text/plain": [ " tailnum origin dest carrier_AA carrier_AS carrier_B6 carrier_DL \\\n", "0 N508AS PDX ANC 0 1 0 0 \n", "1 N195UW SEA CLT 0 0 0 0 \n", "2 N37422 PDX IAH 0 0 0 0 \n", "3 N547UW PDX CLT 0 0 0 0 \n", "4 N762AS SEA ANC 0 1 0 0 \n", "\n", " carrier_F9 carrier_HA carrier_OO carrier_UA carrier_US carrier_VX \\\n", "0 0 0 0 0 0 0 \n", "1 0 0 0 0 1 0 \n", "2 0 0 0 1 0 0 \n", "3 0 0 0 0 1 0 \n", "4 0 0 0 0 0 0 \n", "\n", " carrier_WN \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_df_flights_onehot.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_onehot_sklearn = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "lb = LabelBinarizer()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "lb_result = lb.fit_transform(cat_df_flights_onehot_sklearn['carrier'])" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "lb_result_df = pd.DataFrame(lb_result, columns=lb.classes_)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAASB6DLF9HAOOUAUSVXWN
001000000000
100000000100
200000001000
300000000100
401000000000
\n", "
" ], "text/plain": [ " AA AS B6 DL F9 HA OO UA US VX WN\n", "0 0 1 0 0 0 0 0 0 0 0 0\n", "1 0 0 0 0 0 0 0 0 1 0 0\n", "2 0 0 0 0 0 0 0 1 0 0 0\n", "3 0 0 0 0 0 0 0 0 1 0 0\n", "4 0 1 0 0 0 0 0 0 0 0 0" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lb_result_df.head()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "result_df = pd.concat([cat_df_flights_onehot_sklearn, lb_result_df], axis=1)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carriertailnumorigindestAAASB6DLF9HAOOUAUSVXWN
0ASN508ASPDXANC01000000000
1USN195UWSEACLT00000000100
2UAN37422PDXIAH00000001000
3USN547UWPDXCLT00000000100
4ASN762ASSEAANC01000000000
\n", "
" ], "text/plain": [ " carrier tailnum origin dest AA AS B6 DL F9 HA OO UA US VX WN\n", "0 AS N508AS PDX ANC 0 1 0 0 0 0 0 0 0 0 0\n", "1 US N195UW SEA CLT 0 0 0 0 0 0 0 0 1 0 0\n", "2 UA N37422 PDX IAH 0 0 0 0 0 0 0 1 0 0 0\n", "3 US N547UW PDX CLT 0 0 0 0 0 0 0 0 1 0 0\n", "4 AS N762AS SEA ANC 0 1 0 0 0 0 0 0 0 0 0" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binary encoding" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "cat_df_flights_ce = cat_df_flights.copy()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "encoder = ce.BinaryEncoder(cols=['carrier'])" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "df_binary = encoder.fit_transform(cat_df_flights_ce)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
carrier_0carrier_1carrier_2carrier_3carrier_4tailnumorigindest
000001N508ASPDXANC
100010N195UWSEACLT
200011N37422PDXIAH
300010N547UWPDXCLT
400001N762ASSEAANC
\n", "
" ], "text/plain": [ " carrier_0 carrier_1 carrier_2 carrier_3 carrier_4 tailnum origin dest\n", "0 0 0 0 0 1 N508AS PDX ANC\n", "1 0 0 0 1 0 N195UW SEA CLT\n", "2 0 0 0 1 1 N37422 PDX IAH\n", "3 0 0 0 1 0 N547UW PDX CLT\n", "4 0 0 0 0 1 N762AS SEA ANC" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_binary.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Backward difference encoding" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "encoder = ce.BackwardDifferenceEncoder(cols=['carrier'])" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "df_bd = encoder.fit_transform(cat_df_flights_ce)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
interceptcarrier_0carrier_1carrier_2carrier_3carrier_4carrier_5carrier_6carrier_7carrier_8carrier_9tailnumorigindest
01-0.909091-0.818182-0.727273-0.636364-0.545455-0.454545-0.363636-0.272727-0.181818-0.090909N508ASPDXANC
110.090909-0.818182-0.727273-0.636364-0.545455-0.454545-0.363636-0.272727-0.181818-0.090909N195UWSEACLT
210.0909090.181818-0.727273-0.636364-0.545455-0.454545-0.363636-0.272727-0.181818-0.090909N37422PDXIAH
310.090909-0.818182-0.727273-0.636364-0.545455-0.454545-0.363636-0.272727-0.181818-0.090909N547UWPDXCLT
41-0.909091-0.818182-0.727273-0.636364-0.545455-0.454545-0.363636-0.272727-0.181818-0.090909N762ASSEAANC
\n", "
" ], "text/plain": [ " intercept carrier_0 carrier_1 carrier_2 carrier_3 carrier_4 \\\n", "0 1 -0.909091 -0.818182 -0.727273 -0.636364 -0.545455 \n", "1 1 0.090909 -0.818182 -0.727273 -0.636364 -0.545455 \n", "2 1 0.090909 0.181818 -0.727273 -0.636364 -0.545455 \n", "3 1 0.090909 -0.818182 -0.727273 -0.636364 -0.545455 \n", "4 1 -0.909091 -0.818182 -0.727273 -0.636364 -0.545455 \n", "\n", " carrier_5 carrier_6 carrier_7 carrier_8 carrier_9 tailnum origin dest \n", "0 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N508AS PDX ANC \n", "1 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N195UW SEA CLT \n", "2 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N37422 PDX IAH \n", "3 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N547UW PDX CLT \n", "4 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N762AS SEA ANC " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_bd.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Miscellaneous features" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "dummy_df_age = pd.DataFrame({'age':['0-20', '20-40', '40-60','60-80']})" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "dummy_df_age['start'], dummy_df_age['end'] = zip(*dummy_df_age['age'].map(lambda x: x.split('-')))" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agestartend
00-20020
120-402040
240-604060
360-806080
\n", "
" ], "text/plain": [ " age start end\n", "0 0-20 0 20\n", "1 20-40 20 40\n", "2 40-60 40 60\n", "3 60-80 60 80" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummy_df_age.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "dummy_df_age = pd.DataFrame({'age':['0-20', '20-40', '40-60','60-80']})" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "def split_mean(x):\n", " split_list = x.split('-')\n", " mean = float(split_list[0]) + float(split_list[1]) / 2\n", " return mean" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "dummy_df_age['age_mean'] = dummy_df_age['age'].apply(lambda x: split_mean(x))" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ageage_mean
00-2010.0
120-4040.0
240-6070.0
360-80100.0
\n", "
" ], "text/plain": [ " age age_mean\n", "0 0-20 10.0\n", "1 20-40 40.0\n", "2 40-60 70.0\n", "3 60-80 100.0" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummy_df_age.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" } }, "nbformat": 4, "nbformat_minor": 4 }